001    /*
002     * CharSequence.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    import java.io.Reader;
035    import java.io.BufferedReader;
036    import java.io.IOException;
037    
038    /**
039     * This class implements a sequence of characters stored as an array that provides random
040     * access to any position in constant time.
041     *
042     * <P>The input can come from any source, provided it is encapsulated in a proper
043     * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next
044     * <CODE>read</CODE> operation must return the first character of the sequence) and it is
045     * not closed when its end is reached, so the client is allowed to reset it and maybe use
046     * it for another purpose.</P>
047     *
048     * <P>Sequences can contain letters only although lines started with the
049     * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely
050     * skipped. White spaces (including tabs, line feeds and carriage returns) are also
051     * ignored throughout.</P>
052     *
053     * <P>This class is used by two sequence alignment algorithms: {@linkplain SmithWaterman}
054     * and {@linkplain NeedlemanWunsch}.</P>
055     *
056     * @author Sergio A. de Carvalho Jr.
057     * @see SmithWaterman
058     * @see NeedlemanWunsch
059     */
060    public class CharSequence
061    {
062            /**
063             * The character used to start a comment line in a sequence file. When this character
064             * is found, the rest of the line is ignored.
065             */
066            protected static final char COMMENT_CHAR = '>';
067    
068            /**
069             * Stores the sequence as an array of characters.
070             */
071            protected char sequence[];
072    
073            /**
074             * Creates a new instance of a <CODE>CharSequence</CODE>, loading the sequence data
075             * from the <CODE>Reader</CODE> input stream.
076             *
077             * @param reader source of characters for this sequence
078             * @throws IOException if an I/O exception occurs when reading the input
079             * @throws InvalidSequenceException if the input does not contain a valid sequence
080             */
081            public CharSequence (Reader reader) throws IOException, InvalidSequenceException
082            {
083                    int ch;
084                    char c;
085    
086                    BufferedReader input = new BufferedReader(reader);
087    
088                    StringBuffer buf = new StringBuffer();
089    
090                    // read characters
091                    while ((ch = input.read()) != -1)
092                    {
093                            // conver to char
094                            c = (char) ch;
095    
096                            // skip line if comment character is found
097                            if (c == COMMENT_CHAR)
098                                    input.readLine();
099    
100                            // accept letters only
101                            else if (Character.isLetter(c))
102                                    buf.append(c);
103    
104                            // anything else, except whitespaces, will throw an exception
105                            else if (!Character.isWhitespace(c))
106                                    throw new InvalidSequenceException
107                                            ("Sequences can contain letters only.");
108                    }
109    
110                    // check if read anything!
111                    if (buf.length() > 0)
112                            sequence = new char[buf.length()];
113                    else
114                            throw new InvalidSequenceException ("Empty sequence.");
115    
116                    // copy data to
117                    buf.getChars(0, buf.length(), sequence, 0);
118            }
119    
120            /**
121             * Returns the number of characters of this sequence.
122             *
123             * @return int number of characters of this sequence
124             */
125            public int length ()
126            {
127                    return sequence.length;
128            }
129    
130            /**
131             * Returns the character at a given position. For the client, the first character is
132             * at position 1, while the last character is at position <CODE>length()</CODE>. This
133             * is convinient for sequence alignment algorithms based on a classic dynamic
134             * programming matrix since the sequences usually start at row/column 1. This method
135             * does not check boundaries, therefore an <CODE>ArrayIndexOutOfBoundsException</CODE>
136             * may be raised if <CODE>pos</CODE> is out of bounds.
137             *
138             * @param pos position of character (from 1 to <CODE>length()</CODE> inclusive)
139             * @return the character
140             */
141            public char charAt (int pos)
142            {
143                    // convert from one-based to zero-based index
144                    return sequence[pos-1];
145            }
146    
147            /**
148             * Returns a string representation of the sequence.
149             *
150             * @return a string representation of the sequence
151             */
152            public String toString ()
153            {
154                    return new String(sequence);
155            }
156    }